This is a knitted report to predict sales of Big Mart. The dataset is the public BigMart Sales dataset (the original download hyperlink was lost when this report was exported to plain text). Firstly, we load the packages required for the project.

## load packages
library(data.table) # used for reading and manipulation of data
library(dplyr)      # used for data manipulation and joining
library(ggplot2)    # used for plotting
library(caret)      # used for modeling
library(corrplot)   # used for making correlation plot
library(xgboost)    # used for building XGBoost model
library(cowplot)    # used for combining multiple plots
library(RColorBrewer) # used for setting colors
library(ranger)     # fast random forest backend used by caret's method='ranger'

Problem Overview

The data scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. Also, certain attributes of each product and store have been defined. The aim is to build a predictive model and find out the sales of each product at a particular store. Using this model, BigMart will try to understand the properties of products and stores which play a key role in increasing sales. We have train (8523) and test (5681) data set, train data set has both input and output variable(s). We need to predict the sales for test data set. It’s a regression problem.

## read datasets
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path (e.g. here::here()) so the report knits on other machines.
setwd("/Users/monica/Documents/[Rutgers]Study/2018fall/AnalyticsBusIntell/GroupAssignment/Final")
train <- fread("data/train.csv")
test <- fread("data/test.csv")
# sample submission ships with an extra third column we do not need
submission <- fread("data/SampleSubmission.csv")[, -3]

## Inspect both data sets: dimensions, column names and structure.
## The test set lacks Item_Outlet_Sales (the target). Two things to note
## in the str() output below: Outlet_Size contains "" (empty strings),
## i.e. missing values in disguise, and Item_Fat_Content mixes spellings
## ("Low Fat", "reg", ...); both are cleaned later in preprocessing.
## train data dimension
dim(train)
## [1] 8523   12
## test data dimension
dim(test)
## [1] 5681   11
## train data column names
names(train)
##  [1] "Item_Identifier"           "Item_Weight"              
##  [3] "Item_Fat_Content"          "Item_Visibility"          
##  [5] "Item_Type"                 "Item_MRP"                 
##  [7] "Outlet_Identifier"         "Outlet_Establishment_Year"
##  [9] "Outlet_Size"               "Outlet_Location_Type"     
## [11] "Outlet_Type"               "Item_Outlet_Sales"
## test data column names
names(test)
##  [1] "Item_Identifier"           "Item_Weight"              
##  [3] "Item_Fat_Content"          "Item_Visibility"          
##  [5] "Item_Type"                 "Item_MRP"                 
##  [7] "Outlet_Identifier"         "Outlet_Establishment_Year"
##  [9] "Outlet_Size"               "Outlet_Location_Type"     
## [11] "Outlet_Type"
## structure of train data
str(train)
## Classes 'data.table' and 'data.frame':   8523 obs. of  12 variables:
##  $ Item_Identifier          : chr  "FDA15" "DRC01" "FDN15" "FDX07" ...
##  $ Item_Weight              : num  9.3 5.92 17.5 19.2 8.93 ...
##  $ Item_Fat_Content         : chr  "Low Fat" "Regular" "Low Fat" "Regular" ...
##  $ Item_Visibility          : num  0.016 0.0193 0.0168 0 0 ...
##  $ Item_Type                : chr  "Dairy" "Soft Drinks" "Meat" "Fruits and Vegetables" ...
##  $ Item_MRP                 : num  249.8 48.3 141.6 182.1 53.9 ...
##  $ Outlet_Identifier        : chr  "OUT049" "OUT018" "OUT049" "OUT010" ...
##  $ Outlet_Establishment_Year: int  1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
##  $ Outlet_Size              : chr  "Medium" "Medium" "Medium" "" ...
##  $ Outlet_Location_Type     : chr  "Tier 1" "Tier 3" "Tier 1" "Tier 3" ...
##  $ Outlet_Type              : chr  "Supermarket Type1" "Supermarket Type2" "Supermarket Type1" "Grocery Store" ...
##  $ Item_Outlet_Sales        : num  3735 443 2097 732 995 ...
##  - attr(*, ".internal.selfref")=<externalptr>
## structure of test data
str(test)
## Classes 'data.table' and 'data.frame':   5681 obs. of  11 variables:
##  $ Item_Identifier          : chr  "FDW58" "FDW14" "NCN55" "FDQ58" ...
##  $ Item_Weight              : num  20.75 8.3 14.6 7.32 NA ...
##  $ Item_Fat_Content         : chr  "Low Fat" "reg" "Low Fat" "Low Fat" ...
##  $ Item_Visibility          : num  0.00756 0.03843 0.09957 0.01539 0.1186 ...
##  $ Item_Type                : chr  "Snack Foods" "Dairy" "Others" "Snack Foods" ...
##  $ Item_MRP                 : num  107.9 87.3 241.8 155 234.2 ...
##  $ Outlet_Identifier        : chr  "OUT049" "OUT017" "OUT010" "OUT017" ...
##  $ Outlet_Establishment_Year: int  1999 2007 1998 2007 1985 1997 2009 1985 2002 2007 ...
##  $ Outlet_Size              : chr  "Medium" "" "" "" ...
##  $ Outlet_Location_Type     : chr  "Tier 1" "Tier 2" "Tier 3" "Tier 2" ...
##  $ Outlet_Type              : chr  "Supermarket Type1" "Supermarket Type1" "Grocery Store" "Supermarket Type1" ...
##  - attr(*, ".internal.selfref")=<externalptr>

As we can see, we have two identifiers, Item_Identifier and Outlet_Identifier, which together uniquely identify each row. We still have five categorical variables and five numeric variables. Item_Outlet_Sales appears only in the train set and is our target variable. We combine the train and test data sets into one, perform feature engineering, and then split them apart again later.

# Give test the target column (filled with NA) so both tables have the
# same schema, then stack train on top of test for joint preprocessing.
test[, Item_Outlet_Sales := NA]

combi <- rbind(train, test)

Exploratory Data Analysis (EDA)

Now we will perform some basic data exploration. We divide the process into univariate and bivariate in order to explore the distribution of variables and relationship among them. In this report, I will focus on technical process. Our data analyst will delve deeper into business insights.

Univariate

We will try to visualize the continuous variables using histograms and categorical variables using bar plots. The distributions of Item_Outlet_Sales and Item_Visibility are skewed, which means we should perform some transformation.

# Distribution of the target, Item_Outlet_Sales: right-skewed.
# (Fix: use the bare column name inside aes() -- train$Item_Outlet_Sales
# bypassed ggplot's data masking and is a known anti-pattern.)
ggplot(train) + 
  geom_histogram(aes(Item_Outlet_Sales), binwidth = 100, fill = brewer.pal(7, "Set3")[6]) +
  xlab("Item_Outlet_Sales")

# distribution of Item_Weight
p1 = ggplot(combi) + 
  geom_histogram(aes(Item_Weight), binwidth = 0.5, fill = brewer.pal(7, "Accent")[5])
# distribution of Item_Visibility (comment fixed: was mislabelled Item_Weight)
p2 = ggplot(combi) +
  geom_histogram(aes(Item_Visibility), binwidth = 0.005, fill = brewer.pal(7, "Accent")[5])
# distribution of Item_MRP (comment fixed: was mislabelled Item_Weight)
p3 = ggplot(combi) + geom_histogram(aes(Item_MRP), binwidth = 1, fill = brewer.pal(7, "Accent")[5])
# put into one picture
plot_grid(p1, p2, p3, nrow = 1) # plot_grid() from cowplot package
## Warning: Removed 2439 rows containing non-finite values (stat_bin).

Note that there are a lot of unreasonable zeros in the Item_Visibility variable. We can treat them as missing values.
For Item_Fat_Content, ‘LF’, ‘low fat’, and ‘Low Fat’ are the same category and can be combined into one, as well as ‘reg’ and ‘Regular’.

# bar plot of Item_Fat_Content before recoding
# (comment fixed: this is a bar plot, not a boxplot)
ggplot(combi %>% group_by(Item_Fat_Content) %>% summarise(Count = n())) + 
  geom_bar(aes(Item_Fat_Content, Count), stat = "identity", fill = brewer.pal(7, "Accent")[5])

# Unify spellings that denote the same category. A single vectorised %in%
# replaces the two separate scalar replacements for the Low Fat variants.
combi$Item_Fat_Content[combi$Item_Fat_Content %in% c("LF", "low fat")] = "Low Fat"
combi$Item_Fat_Content[combi$Item_Fat_Content == "reg"] = "Regular"

# bar plot after recoding
ggplot(combi %>% group_by(Item_Fat_Content) %>% summarise(Count = n())) + 
  geom_bar(aes(Item_Fat_Content, Count), stat = "identity", fill = brewer.pal(7, "Set3")[6], width = 0.5)

Check other categorical variables. There are 4016 missing values in Outlet_Size. We need to impute them before modeling.

# Bar plots for the categorical variables, with counts labelled on bars.
# Item_Type: 16 raw categories
p4 = ggplot(combi %>% group_by(Item_Type) %>% summarise(Count = n())) + 
  geom_bar(aes(Item_Type, Count), stat = "identity", fill = brewer.pal(7, "Accent")[5]) +
  xlab("") +
  geom_label(aes(Item_Type, Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Item_Type")
# Outlet_Identifier: row counts per store
p5 = ggplot(combi %>% group_by(Outlet_Identifier) %>% summarise(Count = n())) + 
  geom_bar(aes(Outlet_Identifier, Count), stat = "identity", fill = brewer.pal(7, "Set3")[6]) +
  geom_label(aes(Outlet_Identifier, Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Outlet_Size: note the large "" (blank) bar -- the 4016 missing values
p6 = ggplot(combi %>% group_by(Outlet_Size) %>% summarise(Count = n())) + 
  geom_bar(aes(Outlet_Size, Count), stat = "identity", fill = brewer.pal(7, "Set3")[6]) +
  geom_label(aes(Outlet_Size, Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# arrange the three plots on one canvas
second_row = plot_grid(p5, p6, nrow = 1)
plot_grid(p4, second_row, ncol = 1)

# Outlet_Establishment_Year treated as discrete via factor()
p7 = ggplot(combi %>% group_by(Outlet_Establishment_Year) %>% summarise(Count = n())) + 
  geom_bar(aes(factor(Outlet_Establishment_Year), Count), 
           stat = "identity", fill = brewer.pal(7, "Set3")[6]) +
  geom_label(aes(factor(Outlet_Establishment_Year), Count, label = Count), vjust = 0.5) +
  xlab("Outlet_Establishment_Year") +
  theme(axis.text.x = element_text(size = 8.5))
# plot for Outlet_Type
# (consistency fix: geom_label now maps Outlet_Type directly, matching the
# geom_bar aesthetic, instead of wrapping it in factor())
p8 = ggplot(combi %>% group_by(Outlet_Type) %>% summarise(Count = n())) + 
  geom_bar(aes(Outlet_Type, Count), stat = "identity", fill = brewer.pal(7, "Accent")[5]) +
  geom_label(aes(Outlet_Type, Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(size = 8.5))
# plot both together
plot_grid(p7, p8, ncol = 2)

Bivariate

For bivariate analysis, we hope to explore the relationship between predictors to the target (Item_Outlet_Sales). We will use scatter plots for numeric variables and violin plots for the categorical variables.

# Keep only the labelled (original train) portion of combi for plots
# against the target.
train = combi[1:nrow(train)]

# Scatter plots of each numeric predictor against Item_Outlet_Sales;
# the point colour is computed once and reused.
point_col = brewer.pal(7, "GnBu")[6]
p9 = ggplot(train) + 
  geom_point(aes(Item_Weight, Item_Outlet_Sales), colour = point_col, alpha = 0.3) +
  theme(axis.title = element_text(size = 8.5))
p10 = ggplot(train) + 
  geom_point(aes(Item_Visibility, Item_Outlet_Sales), colour = point_col, alpha = 0.3) +
  theme(axis.title = element_text(size = 8.5))
p11 = ggplot(train) + 
  geom_point(aes(Item_MRP, Item_Outlet_Sales), colour = point_col, alpha = 0.3) +
  theme(axis.title = element_text(size = 8.5))
# Item_Weight on top; visibility and MRP side by side below
second_row_2 = plot_grid(p10, p11, ncol = 2)
plot_grid(p9, second_row_2, nrow = 2)
## Warning: Removed 1463 rows containing missing values (geom_point).

Note that in the plot of Item_MRP vs Item_Outlet_Sales, there are clearly 4 segments. We will use the observation later in feature engineering.

# Violin plots: distribution of the target within each category level.
violin_fill = brewer.pal(7, "Set3")[6]
# Item_Type vs Item_Outlet_Sales (smaller axis text: 16 categories)
p12 = ggplot(train) + geom_violin(aes(Item_Type, Item_Outlet_Sales), fill = violin_fill) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text = element_text(size = 6),
        axis.title = element_text(size = 8.5))
# shared theme for the two narrower plots
rotated_theme = theme(axis.text.x = element_text(angle = 45, hjust = 1),
                      axis.text = element_text(size = 8),
                      axis.title = element_text(size = 8.5))
# Item_Fat_Content vs Item_Outlet_Sales
p13 = ggplot(train) + 
  geom_violin(aes(Item_Fat_Content, Item_Outlet_Sales), fill = violin_fill) +
  rotated_theme
# Outlet_Identifier vs Item_Outlet_Sales
p14 = ggplot(train) + 
  geom_violin(aes(Outlet_Identifier, Item_Outlet_Sales), fill = violin_fill) +
  rotated_theme
# arrange: Item_Type on top, the other two below
second_row_3 = plot_grid(p13, p14, ncol = 2)
plot_grid(p12, second_row_3, ncol = 1)

# Outlet_Size vs Item_Outlet_Sales -- note the "" (blank) level
ggplot(train) + 
  geom_violin(aes(Outlet_Size, Item_Outlet_Sales), fill = brewer.pal(7, "GnBu")[6])

In the above plot, the distribution of Item_Outlet_Sales for the blank (missing) size is nearly identical to that for the small size. So we impute the missing values in Outlet_Size with “Small”.

# Violin plots for the two remaining outlet attributes, stacked vertically.
outlet_fill = brewer.pal(7, "Set3")[6]
p15 = ggplot(train) + 
  geom_violin(aes(Outlet_Location_Type, Item_Outlet_Sales), fill = outlet_fill)
p16 = ggplot(train) + 
  geom_violin(aes(Outlet_Type, Item_Outlet_Sales), fill = outlet_fill)
plot_grid(p15, p16, ncol = 1)

Data Preprocessing

Firstly, let’s look at some basic statistics of attributes. Item_Outlet_Sales is our target variable that we need to predict, namely the 5681 missing values in test set. Besides, we still have some missing values to impute. All categorical variables are characters. So we need to transform them into factor. However, there are too many factors, so we need to perform some feature engineering and encoding.

# Five-number summaries for every column of the combined data.
# Key takeaways: Item_Weight has 2439 NAs, Item_Outlet_Sales has 5681 NAs
# (the appended test rows), and Item_Visibility bottoms out at 0, which is
# implausible for a stocked item and is treated as missing later.
summary(combi)
##  Item_Identifier     Item_Weight     Item_Fat_Content   Item_Visibility  
##  Length:14204       Min.   : 4.555   Length:14204       Min.   :0.00000  
##  Class :character   1st Qu.: 8.710   Class :character   1st Qu.:0.02704  
##  Mode  :character   Median :12.600   Mode  :character   Median :0.05402  
##                     Mean   :12.793                      Mean   :0.06595  
##                     3rd Qu.:16.750                      3rd Qu.:0.09404  
##                     Max.   :21.350                      Max.   :0.32839  
##                     NA's   :2439                                         
##   Item_Type            Item_MRP      Outlet_Identifier 
##  Length:14204       Min.   : 31.29   Length:14204      
##  Class :character   1st Qu.: 94.01   Class :character  
##  Mode  :character   Median :142.25   Mode  :character  
##                     Mean   :141.00                     
##                     3rd Qu.:185.86                     
##                     Max.   :266.89                     
##                                                        
##  Outlet_Establishment_Year Outlet_Size        Outlet_Location_Type
##  Min.   :1985              Length:14204       Length:14204        
##  1st Qu.:1987              Class :character   Class :character    
##  Median :1999              Mode  :character   Mode  :character    
##  Mean   :1998                                                     
##  3rd Qu.:2004                                                     
##  Max.   :2009                                                     
##                                                                   
##  Outlet_Type        Item_Outlet_Sales 
##  Length:14204       Min.   :   33.29  
##  Class :character   1st Qu.:  834.25  
##  Mode  :character   Median : 1794.33  
##                     Mean   : 2181.29  
##                     3rd Qu.: 3101.30  
##                     Max.   :13086.97  
##                     NA's   :5681

Missing Value Treatment

There are 2439 missing values in Item_Weight; we impute them with the mean weight of the same product item. This makes sense because items of the same product usually have similar or even identical weights.

# Impute missing Item_Weight with the mean weight of the same product
# (rows sharing an Item_Identifier describe the same item).
# Vectorised per-group replacement instead of the original row-by-row
# loop, which recomputed the group mean once per missing row (O(n^2)).
# The result is identical: each NA becomes the group mean of the
# originally observed weights.
combi[, Item_Weight := ifelse(is.na(Item_Weight),
                              mean(Item_Weight, na.rm = TRUE),
                              Item_Weight),
      by = Item_Identifier]
# Check if there is still any missing data in Item_Weight.
sum(is.na(combi$Item_Weight))
## [1] 0

Similarly, we treat zeros in Item_Visibility as missing values and impute them with the mean value of the same product.

# Treat zeros in Item_Visibility as missing and impute them with the mean
# visibility of the same product.
# Bug fix: the original loop computed the mean over ALL rows of the item,
# including the zeros being replaced (and, because it updated sequentially,
# earlier imputations fed into later means). Here the mean is taken over
# the non-zero observations only, per product, in one vectorised pass.
# (Items whose every visibility is zero would yield NaN; none are expected
# after the earlier cleaning -- verify if the data changes.)
combi[, Item_Visibility := ifelse(Item_Visibility == 0,
                                  mean(Item_Visibility[Item_Visibility > 0], na.rm = TRUE),
                                  Item_Visibility),
      by = Item_Identifier]
# Check the distribution of Item_Visibility
ggplot(combi) + geom_histogram(aes(Item_Visibility), bins = 100, fill = brewer.pal(7, "Accent")[5])

Feature Engineering

Some categorical variables have so many categories that it’s impossible to fit models on them. So we can extract some information related to sales prediction and create some new features to replace them. Firstly, we can look at the Item_Type variable and create a new variable to classify the type categories into perishable and non_perishable.

# create a new feature 'Item_Type_new' (replacing Item_Type)
# Frequency table of the 16 raw Item_Type categories:
table(combi$Item_Type)
## 
##          Baking Goods                Breads             Breakfast 
##                  1086                   416                   186 
##                Canned                 Dairy          Frozen Foods 
##                  1084                  1136                  1426 
## Fruits and Vegetables           Hard Drinks    Health and Hygiene 
##                  2013                   362                   858 
##             Household                  Meat                Others 
##                  1548                   736                   280 
##               Seafood           Snack Foods           Soft Drinks 
##                    89                  1989                   726 
##         Starchy Foods 
##                   269
# Categories with a clear shelf life are tagged "perishable", shelf-stable
# ones "non_perishable"; the rest (Snack Foods, Starchy Foods, Others)
# are ambiguous and kept as "not_sure".
perishable = c("Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood")
non_perishable = c("Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene",
                   "Household", "Soft Drinks")
combi[,Item_Type_new := ifelse(Item_Type %in% perishable, "perishable",
                               ifelse(Item_Type %in% non_perishable, "non_perishable", "not_sure"))]

We find that the first two characters of Item_Identifier are “DR”, “FD” and “NC”, which most likely represent drink, food and non-consumable. We compare these two characters with Item_Type, which validates our assumption. So we create another variable, Item_category. Consequently, it is unreasonable for “NC” items to be labelled “Low Fat”, so we change their label to “NonEdible”. We also create two more features: Outlet_Years and price_per_unit_wt.

# compare 'Item_Type' with the first two characters of 'Item_Identifier':
# every type maps cleanly into exactly one of DR / FD / NC
table(combi$Item_Type, substr(combi$Item_Identifier, 1, 2))
##                        
##                           DR   FD   NC
##   Baking Goods             0 1086    0
##   Breads                   0  416    0
##   Breakfast                0  186    0
##   Canned                   0 1084    0
##   Dairy                  229  907    0
##   Frozen Foods             0 1426    0
##   Fruits and Vegetables    0 2013    0
##   Hard Drinks            362    0    0
##   Health and Hygiene       0    0  858
##   Household                0    0 1548
##   Meat                     0  736    0
##   Others                   0    0  280
##   Seafood                  0   89    0
##   Snack Foods              0 1989    0
##   Soft Drinks            726    0    0
##   Starchy Foods            0  269    0
# create a new feature 'Item_category' from the identifier prefix
# (idiom fix: bare column name inside the data.table frame; the original
# combi$Item_Identifier was redundant)
combi[, Item_category := substr(Item_Identifier, 1, 2)]

# Non-consumables cannot meaningfully be "Low Fat": relabel them.
# (idiom: data.table sub-assign by reference instead of base $<- indexing)
combi[Item_category == "NC", Item_Fat_Content := "NonEdible"]

# years of operation of outlets (replacing 'Outlet_Establishment_Year');
# 2013 is the year the sales data was collected
combi[, Outlet_Years := 2013 - Outlet_Establishment_Year]
combi$Outlet_Establishment_Year = as.factor(combi$Outlet_Establishment_Year)

# Price per unit weight
combi[, price_per_unit_wt := Item_MRP / Item_Weight]

In the EDA section, we found four segments in the Item_MRP vs Item_Outlet_Sales plot. So we create a new variable, Item_MRP_clusters, with four categories.

# Bin Item_MRP into the four price segments visible in the MRP-vs-sales
# scatter plot. cut() with half-open intervals [lo, hi) reproduces the
# original nested-ifelse thresholds (<69, [69,136), [136,203), >=203)
# exactly; as.character keeps the column a character vector as before.
combi[, Item_MRP_clusters := as.character(cut(Item_MRP,
                                              breaks = c(-Inf, 69, 136, 203, Inf),
                                              labels = c("1st", "2nd", "3rd", "4th"),
                                              right = FALSE))]

Category Encoding

Most machine learning algorithms produce better results with numerical variables only. So we convert categorical variables into numerical ones through two encoding techniques: label encoding and one hot encoding. Label encoding simply converts each category in a variable to a number; it is more suitable for ordinal variables, i.e. categorical variables with some inherent order. One hot encoding converts each category into a new binary column (1/0).

## Label Encoding (ordinal variables)
# Bug fix: the original mapped the "" (missing) Outlet_Size into the
# final else branch, encoding it as 2 together with "High". The EDA
# concluded blanks behave like "Small", so they are now encoded as 0.
combi[, Outlet_Size_num := ifelse(Outlet_Size %in% c("Small", ""), 0,
                                  ifelse(Outlet_Size == "Medium", 1, 2))]
combi[, Outlet_Location_Type_num := ifelse(Outlet_Location_Type == "Tier 3", 0,
                                           ifelse(Outlet_Location_Type == "Tier 2", 1, 2))]
# remove categorical variables after label encoding
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL]

## One Hot Encoding for the remaining nominal categoricals
# fullRank = TRUE drops one dummy per variable to avoid perfect collinearity
# (T replaced with TRUE: T is a reassignable variable, not a keyword)
ohe = dummyVars("~.", data = combi[,-c("Item_Identifier", "Outlet_Establishment_Year", "Item_Type")], fullRank = TRUE)
ohe_df = data.table(predict(ohe, combi[,-c("Item_Identifier", "Outlet_Establishment_Year", "Item_Type")]))
combi = cbind(combi[,"Item_Identifier"], ohe_df)

Variable transformation

For variables with skewed distributions, we consider transform them by taking log.

# Reduce right-skew of the two skewed predictors with a log transform.
combi[,Item_Visibility := log(Item_Visibility + 1)] # +1 so log() is defined at 0 (log(0) = -Inf); the original comment's "division by zero" was wrong
ggplot(combi) +
  geom_histogram(aes(Item_Visibility), binwidth = 0.005, fill = brewer.pal(7, "Accent")[5])

combi[,price_per_unit_wt := log(price_per_unit_wt + 1)] # same +1 shift before the log
ggplot(combi) +
  geom_histogram(aes(price_per_unit_wt), binwidth = 0.005, fill = brewer.pal(7, "Accent")[5])

Scaling and Centering

We hope to scale and center the numeric variables to make them have a mean of zero, standard deviation of one and scale of 0 to 1. Scaling and centering is required for linear regression models.

# Identify the numeric columns so only they are centred/scaled.
# only for numerical variables
num_vars = which(sapply(combi, is.numeric))
num_vars_names = names(num_vars)
# exclude the target variable -- we must not transform what we predict
combi_numeric = combi[,setdiff(num_vars_names, "Item_Outlet_Sales"), with = F]

# scaling and centering data (caret::preProcess learns means and sds,
# predict() applies them)
prep_num = preProcess(combi_numeric, method=c("center", "scale"))
combi_numeric_norm = predict(prep_num, combi_numeric)

combi[,setdiff(num_vars_names, "Item_Outlet_Sales") := NULL] # removing numeric independent variables
combi = cbind(combi, combi_numeric_norm)

# splitting data back to train and test
# (order matters here: nrow(train) still reflects the 8523-row train split)
train = combi[1:nrow(train)]
test = combi[(nrow(train) + 1):nrow(combi)]
test[,Item_Outlet_Sales := NULL] # removing Item_Outlet_Sales as it contains only NA for test dataset
fwrite(train,"data/modified_train.csv")
fwrite(test,"data/modified_test.csv")

Correlation Plot

# Correlation matrix of all engineered predictors (identifier dropped),
# rendered as a lower-triangle pie corrplot.
cor_train = cor(train[, -c("Item_Identifier")])
corrplot(cor_train, type = "lower", method = "pie", tl.pos = "ld",
         tl.cex = 0.8, tl.col = brewer.pal(7, "Accent")[5])

# Optional companion view: numeric coefficients on the upper triangle.
# corrplot(cor_train, add = TRUE, type = "upper", tl.pos = "n", cl.pos = "n",
#          method = "number", tl.cex = 0.8, tl.col = brewer.pal(7, "Accent")[5])

Modeling

We apply the following models:
* Linear regression * Lasso Regression * Ridge Regression * RandomForest * XGBoost

To evaluate the model, we calculate the root mean squared error (RMSE) score for each model and compare the score with the baseline model. The smaller the score, the better our model will be. Our baseline model predicts the sale as overall average sale.

# Reload the engineered data sets and build the baseline: predict the
# overall mean sale for every row. The mean is computed once and reused.
train <- fread("data/modified_train.csv")
test <- fread("data/modified_test.csv")
mean_sales <- mean(train[["Item_Outlet_Sales"]])
submission$Item_Outlet_Sales = mean_sales
fwrite(submission, "data/Baseline_submit.csv", row.names = F)
# baseline rmse score on the train set
sqrt(mean((mean_sales - train[["Item_Outlet_Sales"]])^2))
## [1] 1706.4

Linear Regression

Firstly, we fit a multiple linear regression model and use backward elimination method to cut off insignificant predictors.

# Ordinary least squares on every engineered predictor. The one-hot
# columns are linearly dependent (e.g. the Outlet_Type dummies are implied
# by the Outlet_Identifier dummies), so lm() drops 7 aliased coefficients
# -- see the "singularities" note and the rank-deficiency warning below.
linear_reg_mod = lm(Item_Outlet_Sales ~ ., data = train[,-c("Item_Identifier")])
summary(linear_reg_mod)
## 
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = train[, -c("Item_Identifier")])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4299.1  -674.9   -89.8   575.2  7954.9 
## 
## Coefficients: (7 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    2181.3801    12.2297 178.367   <2e-16 ***
## Item_Weight                     -45.8180    51.9934  -0.881    0.378    
## Item_Fat_ContentNonEdible        -1.7959    19.4101  -0.093    0.926    
## Item_Fat_ContentRegular          19.5174    13.5575   1.440    0.150    
## Item_Visibility                 -10.8684    12.7691  -0.851    0.395    
## Item_MRP                        985.9094    77.5806  12.708   <2e-16 ***
## Outlet_IdentifierOUT013         605.7685    19.1570  31.621   <2e-16 ***
## Outlet_IdentifierOUT017         626.6733    19.1122  32.789   <2e-16 ***
## Outlet_IdentifierOUT018         509.2597    19.1173  26.639   <2e-16 ***
## Outlet_IdentifierOUT019           4.1532    16.5522   0.251    0.802    
## Outlet_IdentifierOUT027        1050.6553    19.1944  54.738   <2e-16 ***
## Outlet_IdentifierOUT035         640.7775    19.1395  33.479   <2e-16 ***
## Outlet_IdentifierOUT045         574.0976    19.1379  29.998   <2e-16 ***
## Outlet_IdentifierOUT046         595.7384    19.1346  31.134   <2e-16 ***
## Outlet_IdentifierOUT049         625.9740    19.1333  32.716   <2e-16 ***
## `Outlet_TypeSupermarket Type1`        NA         NA      NA       NA    
## `Outlet_TypeSupermarket Type2`        NA         NA      NA       NA    
## `Outlet_TypeSupermarket Type3`        NA         NA      NA       NA    
## Item_Type_newnot_sure            -0.9693    13.7456  -0.071    0.944    
## Item_Type_newperishable           6.8906    14.4870   0.476    0.634    
## Item_categoryFD                   7.8201    20.7545   0.377    0.706    
## Item_categoryNC                       NA         NA      NA       NA    
## Outlet_Years                          NA         NA      NA       NA    
## price_per_unit_wt               -74.6903    82.7352  -0.903    0.367    
## Item_MRP_clusters2nd             27.1958    32.6733   0.832    0.405    
## Item_MRP_clusters3rd             55.5387    50.9009   1.091    0.275    
## Item_MRP_clusters4th             43.2283    56.7582   0.762    0.446    
## Outlet_Size_num                       NA         NA      NA       NA    
## Outlet_Location_Type_num              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1129 on 8501 degrees of freedom
## Multiple R-squared:  0.5636, Adjusted R-squared:  0.5625 
## F-statistic: 522.8 on 21 and 8501 DF,  p-value: < 2.2e-16
## predicting on test set and writing a submission file
submission$Item_Outlet_Sales = predict(linear_reg_mod, test[,-c("Item_Identifier")])
## Warning in predict.lm(linear_reg_mod, test[, -c("Item_Identifier")]):
## prediction from a rank-deficient fit may be misleading
# fwrite(submission, "data/Linear_Reg_submit.csv", row.names = F)

# rmse score for train set (in-sample fit; optimistic relative to CV scores)
sqrt(mean((fitted(linear_reg_mod)-train[['Item_Outlet_Sales']])^2))
## [1] 1127.269

When we plot the coefficients, we can see there are several missing values resulting from multicollinearity.

# Bar chart of the fitted coefficients sorted by value; the last row after
# sorting (the intercept, by far the largest estimate) is dropped so the
# predictor bars stay readable. The 7 NA bars removed in the warning are
# the aliased (collinear) coefficients lm() could not estimate.
# (Fix: local data frame renamed from `coef`, which shadowed stats::coef().)
coef_df <- data.frame(coefficient = linear_reg_mod$coefficients,
                      attribute = names(linear_reg_mod$coefficients))
coef_df <- coef_df[order(coef_df$coefficient), ]
coef_df$attribute <- factor(coef_df$attribute, levels = rownames(coef_df))
ggplot(coef_df[-nrow(coef_df), ], aes(x = attribute, y = coefficient)) +
  geom_bar(stat = 'identity', fill = brewer.pal(7, "Set3")[6], width = 0.5) +
  theme(axis.text.x = element_text(size = 6, angle = 90), axis.text.y = element_text(size = 6))
## Warning: Removed 6 rows containing missing values (position_stack).

Penalty-Based Variable Selection in Regression Models

Since the regression model has a large number of covariates as well as categorical variables, we consider some penalty-based estimation approaches to handle the correlated predictors and to reduce overfitting. The first method is the least absolute shrinkage and selection operator (LASSO) algorithm. By applying a penalty parameter to constrain the sum of absolute coefficients, LASSO shrinks the estimates, leading to variable selection and a simplification of the model. Large values of the penalty parameter lead to large shrinkage, and small values result in little shrinkage. Therefore, we need to select the penalty parameter; here we use 5-fold cross validation.

set.seed(1235)
my_control = trainControl(method = "cv", number = 5)
# LASSO: alpha = 1; search lambda over 0 to 10 in steps of 0.01
# (comment fixed: the original claimed "0.001 to 0.1")
Grid = expand.grid(alpha = 1, lambda = seq(0, 10, by = 0.01))
# train model through 5-fold cross validation
lasso_linear_reg_mod = train(x = train[, -c("Item_Identifier", "Item_Outlet_Sales")], 
                             y = train$Item_Outlet_Sales,
                             method='glmnet', trControl= my_control, tuneGrid = Grid)

# mean cross-validated RMSE at the selected lambda
# (comment fixed: this is a CV score, not an in-sample train score)
(rmse_lasso=mean(lasso_linear_reg_mod$resample$RMSE))
## [1] 1129.823
## predicting on test set and writing a submission file
submission$Item_Outlet_Sales = predict(lasso_linear_reg_mod, test[,-c("Item_Identifier")])
#fwrite(submission, "data/Lasso_Reg_submit.csv", row.names = F)

# plot parameter tuning
plot(lasso_linear_reg_mod)

# Plot the coefficients at the cross-validated best lambda.
# (Bug fix: the original hand-picked column 69 of the glmnet coefficient
# path, which need not correspond to the selected lambda; s = bestTune$lambda
# extracts the coefficients the model actually uses for prediction.)
best_lasso_coefs <- as.matrix(coef(lasso_linear_reg_mod$finalModel,
                                   s = lasso_linear_reg_mod$bestTune$lambda))[, 1]
lasso_coef_df <- data.frame(coefficient = best_lasso_coefs,
                            attribute = names(best_lasso_coefs))
lasso_coef_df[is.na(lasso_coef_df)] <- 0
lasso_coef_df <- lasso_coef_df[order(lasso_coef_df$coefficient), ]
lasso_coef_df$attribute <- factor(lasso_coef_df$attribute, levels = rownames(lasso_coef_df))
ggplot(lasso_coef_df[-nrow(lasso_coef_df), ], aes(x = attribute, y = coefficient)) +
  geom_bar(stat = 'identity', fill = brewer.pal(7, "Set3")[6], width = 0.5) +
  theme(axis.text.x = element_text(size = 6, angle = 90), axis.text.y = element_text(size = 6))

# plot importance of predictors 
plot(varImp(lasso_linear_reg_mod))

Ridge Regression is similar to LASSO except that it limits the sum of squared coefficients rather than the sum of absolute coefficients.

set.seed(1235)
my_control = trainControl(method = "cv", number = 5)
# Ridge: alpha = 0; search lambda over 0 to 20 in steps of 0.1
# (comment fixed: the original claimed "0.001 to 0.1")
Grid = expand.grid(alpha = 0, lambda = seq(0, 20, by = 0.1))
# train model through 5-fold cross validation
ridge_linear_reg_mod = train(x = train[, -c("Item_Identifier", "Item_Outlet_Sales")], 
                             y = train$Item_Outlet_Sales,
                             method='glmnet', trControl= my_control, tuneGrid = Grid)

# mean cross-validated RMSE at the selected lambda
(rmse_ridge=mean(ridge_linear_reg_mod$resample$RMSE))
## [1] 1135.36
# Plot the coefficients at the cross-validated best lambda.
# (Bug fix: hand-picked path column 100 replaced by s = bestTune$lambda.)
best_ridge_coefs <- as.matrix(coef(ridge_linear_reg_mod$finalModel,
                                   s = ridge_linear_reg_mod$bestTune$lambda))[, 1]
ridge_coef_df <- data.frame(coefficient = best_ridge_coefs,
                            attribute = names(best_ridge_coefs))
ridge_coef_df[is.na(ridge_coef_df)] <- 0
ridge_coef_df <- ridge_coef_df[order(ridge_coef_df$coefficient), ]
ridge_coef_df$attribute <- factor(ridge_coef_df$attribute, levels = rownames(ridge_coef_df))
ggplot(ridge_coef_df[-nrow(ridge_coef_df), ], aes(x = attribute, y = coefficient)) +
  geom_bar(stat = 'identity', fill = brewer.pal(7, "Set3")[6], width = 0.5) +
  theme(axis.text.x = element_text(size = 6, angle = 90), axis.text.y = element_text(size = 6))

# plot importance of coefficients
plot(varImp(ridge_linear_reg_mod))

## predicting on test set and writing a submission file
submission$Item_Outlet_Sales = predict(ridge_linear_reg_mod, test[,-c("Item_Identifier")])
#fwrite(submission, "data/Ridge_Reg_submit.csv", row.names = F)

RandomForest Model

We build a RandomForest model with 400 trees and run 5-fold cross validation to select some tuning parameters including mtry (no. of predictor variables randomly sampled at each split) and min.node.size (minimum size of terminal nodes).

set.seed(1237)
# 5-fold cross-validation for tuning-parameter selection
my_control = trainControl(method="cv", number=5)

# grid of ranger tuning parameters searched by caret
tgrid = expand.grid(
  .mtry = c(3:10),             # no. of predictors sampled at each split
  .splitrule = "variance",     # split rule for regression
  .min.node.size =c(10,15,20)  # minimum terminal node size
)

# The CV search below is expensive, so it was run once and the fitted model
# cached to disk; the knit loads the cached object instead of retraining.
# rf_start_time<-Sys.time()
# 
# rf_mod = train(x = train[, -c("Item_Identifier", "Item_Outlet_Sales")],
#               y = train$Item_Outlet_Sales,
#               method='ranger',
#               trControl= my_control,
#               tuneGrid = tgrid,
#               num.trees = 400,
#               importance = "permutation")
# rf_end_time<-Sys.time()
# (rf_time<-rf_end_time-rf_start_time)
# 
# save(rf_mod, file="data/rf_mod.Rdata")
load("data/rf_mod.Rdata")

# rmse score (mean RMSE across the 5 CV folds)
(mean(rf_mod$resample$RMSE))
## [1] 1088.306
## plot displaying RMSE scores for different tuning parameters
plot(rf_mod)

## plot variable importance (permutation importance from ranger)
plot(varImp(rf_mod))

# show the results of parameter selection
# best no. of predictors sampled at each split
rf_mod$bestTune$mtry
## [1] 6
# best minimum node size
rf_mod$bestTune$min.node.size
## [1] 20
## predicting on test set and writing a submission file
submission$Item_Outlet_Sales = predict(rf_mod, test[,-c("Item_Identifier")])
# fwrite(submission, "data/Rf_submit.csv", row.names = F)

XGBoost

XGBoost is an advanced gradient boosting algorithm which considers the tradeoff between prediction loss and model complexity in an efficient way. It combines several weak learners to form a strong learner through an iterative process. The algorithm starts from a single base learner and adds a new learner at each iteration based on the residuals of the previous learners. This iterative process is sequential, which can take a long time to converge; however, XGBoost implements parallel computing within each iteration to ensure time efficiency. Either linear models or tree models can serve as the base learner. Here we choose gbtree (the default) because tree models usually perform better. Moreover, we can run cross-validation at each iteration to find the optimal number of boosting iterations in a single run.

## List of initial parameters for XGBoost modeling
# NOTE(review): "reg:linear" is a deprecated alias of "reg:squarederror" in
# recent xgboost releases; kept as-is for reproducibility with this run.
param_list = list(
        objective = "reg:linear",# squared-error regression objective
        eta=0.1,# learning rate, shrinks the feature weights to reduce complexity
        gamma = 0, # the minimum loss reduction required to make a split
        max_depth=5,# maximum depth of each tree
        subsample=0.8,# subsample percentage for training each tree
        colsample_bytree=0.8,#percentage of features selected randomly for training each tree
        seed = 112
        )

## convert train and test into xgb.DMatrix format, dropping the identifier
## column (and, for train, the target column) from the feature matrix
train_features <- setdiff(names(train), c("Item_Identifier", "Item_Outlet_Sales"))
dtrain <- xgb.DMatrix(data = as.matrix(train[, train_features, with = FALSE]),
                      label = train$Item_Outlet_Sales)
test_features <- setdiff(names(test), "Item_Identifier")
dtest <- xgb.DMatrix(data = as.matrix(test[, test_features, with = FALSE]))

## 5-fold cross-validation to find the optimal value of nrounds
set.seed(112)
xgb_start_time <- Sys.time()
xgbcv = xgb.cv(params = param_list, 
               data = dtrain, 
               nrounds = 1000, # maximum number of boosting iterations
               nfold = 5, 
               print_every_n = 1000000, # large value effectively silences per-round logging
               early_stopping_rounds = 30, # stop if the performance doesn't improve for 30 rounds
               maximize = F) # minimize the evaluation metric (RMSE)
## [1]  train-rmse:2535.855176+6.322535 test-rmse:2537.046142+30.117226 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 30 rounds.
## 
## Stopping. Best iteration:
## [40] train-rmse:1000.742297+4.666638 test-rmse:1088.162940+17.513365
# CV-selected number of boosting rounds, reused below for the final fit
xgbcv$best_iteration
## [1] 40
## training the final XGBoost model at the CV-selected number of rounds
## (the old header said "nrounds = 430", but nrounds is xgbcv$best_iteration,
## which is 40 in this run)
xgb_model = xgb.train(data = dtrain, 
                      params = param_list,
                      # named argument `=`, not `<-`: the original
                      # `watchlist <- list(...)` created a stray global
                      # variable and matched the argument only positionally
                      watchlist = list(train = dtrain),
                      print_every_n = 100000,
                      nrounds = xgbcv$best_iteration)
## [1]  train-rmse:2536.204834 
## [40] train-rmse:1012.152588
# rmse score for train set at the final boosting round
(rmse_xgb=xgb_model$evaluation_log$train_rmse[xgbcv$best_iteration])
## [1] 1012.153
## Variable Importance (gain-based, excluding identifier and target columns)
var_imp = xgb.importance(feature_names = setdiff(names(train), c("Item_Identifier", "Item_Outlet_Sales")), 
                         model = xgb_model)
xgb.plot.importance(var_imp)

From the above result, we can see that the train performance of XGBoost is outstanding (the RMSE score is 1012). However, this result is unreliable because we only ran the program once, and the test performance on the LeaderBoard is not ideal (only 1202, worse than random forest). So we run a cross-validation to tune the parameters and obtain a reliable estimate, using grid search to find the optimal parameter values.

# set up the cross-validated hyper-parameter search grid
# (nrounds is fixed at 42, close to the CV-selected best iteration above)
xgb_grid = expand.grid(
  nrounds = 42,
  eta=0.1,# learning rate; the smaller, the more conservative
  max_depth = 3:10,# maximum depth of each tree; the smaller, the more conservative
  min_child_weight=5:11,# minimum sum of instance weight per child; the larger, the more conservative
  gamma = 0,# the minimum loss reduction required to make a split; the larger, the more conservative
  subsample=0.8, # subsample percentage for training each tree; the smaller, the more conservative
  colsample_bytree=0.8#percentage of features selected randomly for training each tree; the smaller, the more conservative
  )
# pack the training control parameters: 5-fold CV, parallel workers allowed
xgb_trcontrol_1 = trainControl(
  method = "cv",
  number = 5,
  allowParallel = TRUE
)

# train the model for each parameter combination in the grid, 
#   using CV to evaluate; identifier and target columns excluded from x
xgb_start_time=Sys.time()
set.seed(112)
xgb_train = train(
  x = train[, -c("Item_Identifier", "Item_Outlet_Sales")],
  y = train$Item_Outlet_Sales,
  trControl = xgb_trcontrol_1,
  tuneGrid = xgb_grid,
  method = "xgbTree"
)
xgb_end_time=Sys.time()
# total wall-clock time for the grid search
(xgb_time<-xgb_end_time-xgb_start_time)
## Time difference of 3.20594 mins
# prediction on the test set for submission
set.seed(112)
submission$Item_Outlet_Sales = predict(xgb_train, test[,-c("Item_Identifier")])
#fwrite(submission, "data/Xgb__.csv", row.names = F)

# calculate rmse score (mean RMSE across the 5 CV folds)
mean(xgb_train$resample$RMSE)
## [1] 1088.965

This time the RMSE score is 1088 and the test score on the LeaderBoard is 1157. This score is much better than before and close to that of random forest. However, the performance is still slightly worse than random forest, probably because we haven't tuned the XGBoost parameters thoroughly enough. We will perform a broader grid search in further study.